# Fit a logistic regression (GLM) predicting whether a customer is a
# buyer ("buyer" == "yes") from demographics and purchase-history
# variables in the bbb data
# NOTE(review): logistic() is presumably radiant.model::logistic — confirm
result <- logistic(
  bbb,
  rvar = "buyer",
  lev = "yes",
  evar = c(
    "gender", "last", "total", "child", "youth",
    "cook", "do_it", "reference", "art", "geog"
  )
)
summary(result)
Logistic regression (GLM)
Data : bbb
Response variable : buyer
Level : yes in buyer
Explanatory variables: gender, last, total, child, youth, cook, do_it, reference, art, geog
Null hyp.: there is no effect of x on buyer
Alt. hyp.: there is an effect of x on buyer
OR OR% coefficient std.error z.value p.value
(Intercept) -2.361 0.049 -47.891 < .001 ***
gender|M 2.140 114.0% 0.761 0.036 21.272 < .001 ***
last 0.910 -9.0% -0.095 0.003 -33.918 < .001 ***
total 1.001 0.1% 0.001 0.000 5.630 < .001 ***
child 0.830 -17.0% -0.186 0.017 -10.775 < .001 ***
youth 0.893 -10.7% -0.113 0.026 -4.327 < .001 ***
cook 0.763 -23.7% -0.270 0.017 -15.782 < .001 ***
do_it 0.583 -41.7% -0.539 0.027 -19.994 < .001 ***
reference 1.265 26.5% 0.235 0.027 8.837 < .001 ***
art 3.176 217.6% 1.156 0.022 52.185 < .001 ***
geog 1.776 77.6% 0.574 0.019 30.824 < .001 ***
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Pseudo R-squared: 0.205
Log-likelihood: -12061.106, AIC: 24144.211, BIC: 24241.229
Chi-squared: 6233.253 df(10), p.value < .001
Nr obs: 50,000
Logistic regression (GLM)
Data : bbb
Response variable : buyer
Level(s) : yes in buyer
Explanatory variables: gender, last, total, child, youth, cook, do_it, reference, art, geog
Interval : confidence
Prediction dataset : bbb
Rows shown : 10 of 50,000
gender last total child youth cook do_it reference art geog Prediction 2.5% 97.5%
M 29 357 3 2 2 0 1 0 2 0.020 0.017 0.024
M 27 138 0 1 0 1 0 0 1 0.017 0.015 0.019
F 15 172 0 0 2 0 0 0 0 0.016 0.014 0.017
F 7 272 0 0 0 0 1 0 0 0.077 0.071 0.083
F 15 149 0 0 1 0 0 0 0 0.020 0.019 0.022
F 7 113 0 1 0 0 0 0 0 0.047 0.044 0.051
M 25 15 0 0 0 1 0 0 0 0.011 0.010 0.013
M 1 238 2 1 2 3 0 0 3 0.087 0.075 0.101
F 5 418 0 2 3 2 0 3 1 0.391 0.353 0.431
F 11 123 0 1 0 0 0 0 0 0.033 0.031 0.036
Function to calculate the TPR and TNR at different trade-off values (a.k.a. break-even values)
#' Compute confusion-matrix counts and rates across break-even thresholds
#'
#' For each cost value, the break-even probability BE = cost / margin is
#' used as the classification cutoff: predictions above BE are classified
#' as positive. Returns one row per cost value.
#'
#' @param outcome Logical vector of observed outcomes (TRUE = positive).
#' @param pred Numeric vector of predicted probabilities, same length.
#' @param cost Numeric vector of cost values (one threshold per element).
#' @param margin Numeric margin (scalar, or vector matching `cost`).
#' @return A tibble with columns cost, margin, BE, TP, FP, TN, FN, TNR, TPR.
slow_roc <- function(outcome, pred, cost, margin) {
  BE <- cost / margin
  # Count each confusion-matrix cell at every threshold; vapply keeps the
  # result a typed double vector (the original loop wrote numeric values
  # into logical NA-initialized columns, forcing row-wise type coercion)
  TP <- vapply(BE, function(be) sum(pred > be & outcome == TRUE), numeric(1))
  FP <- vapply(BE, function(be) sum(pred > be & outcome == FALSE), numeric(1))
  TN <- vapply(BE, function(be) sum(pred <= be & outcome == FALSE), numeric(1))
  FN <- vapply(BE, function(be) sum(pred <= be & outcome == TRUE), numeric(1))
  tibble::tibble(
    cost = cost, margin = margin, BE = BE,
    TP = TP, FP = FP, TN = TN, FN = FN,
    TNR = TN / (TN + FP),  # specificity
    TPR = TP / (TP + FN)   # sensitivity
  )
}
# Creating the data for the ROC curve
# Observed outcomes (TRUE = buyer) and predicted purchase probabilities
outcome <- bbb$buyer == "yes"
pred <- bbb$pred_logit

# Evaluate classification performance over a grid of cost values at a
# fixed margin of 6 (break-even thresholds from 0 to 1)
roc_data <- slow_roc(
  outcome,
  pred,
  cost = seq(0, 6, by = 0.05),
  margin = 6
)
register("roc_data")
# Plotting the ROC curve
# ROC curve: TPR (sensitivity) against TNR (specificity); the x-axis is
# reversed so the curve reads in the conventional direction, and the
# dashed diagonal marks chance-level performance
ggplot(roc_data, aes(x = TNR, y = TPR)) +
  geom_line() +
  geom_abline(intercept = 1, slope = 1, linetype = "dashed") +
  scale_x_reverse() +
  labs(
    x = "TNR (Specificity)",
    y = "TPR (Sensitivity)"
  )
# Calculating the TPR and TNR for the break-even point in the BBB case
# ROC curve with the break-even operating point for the BBB case overlaid
# NOTE(review): bbb_to (a data frame with TNR/TPR for the break-even
# point) is assumed to be defined elsewhere — not visible in this file
p <- ggplot(roc_data, aes(x = TNR, y = TPR)) +
  geom_line() +
  geom_abline(intercept = 1, slope = 1, linetype = "dashed") +
  geom_point(
    data = bbb_to, aes(x = TNR, y = TPR),
    color = "red", size = 3
  ) +
  scale_x_reverse() +
  labs(
    x = "TNR (Specificity)",
    y = "TPR (Sensitivity)"
  )
p
# Interactive version using ggplotly
# Interactive line plot of the confusion-matrix counts (TP, FP, TN, FN)
# as a function of the break-even value, restricted to thresholds with
# at least one true positive
visualize(
  roc_data,
  type = "line",
  xvar = "BE",
  yvar = c("TP", "FP", "TN", "FN"),
  comby = TRUE,
  custom = FALSE,
  data_filter = "TP > 0"
)
# See https://www.alexejgossmann.com/auc/ for a very nice discussion
## Adapted from Alexej's code
# Rank-comparison estimate of the AUC: the probability that a randomly
# chosen buyer receives a higher predicted probability than a randomly
# chosen non-buyer, with exact ties counted as 1/2
s <- 0
did_buy <- which(outcome == TRUE)
did_not_buy <- which(outcome == FALSE)
for (i in did_buy) {
  s <- s + sum(pred[i] > pred[did_not_buy])
  s <- s + sum(pred[i] == pred[did_not_buy]) / 2
}
# Denominator reuses the index vectors already computed above; unlike
# sum(outcome == TRUE) * sum(outcome == FALSE), length() stays NA-safe
# and matches the which()-based numerator exactly
s / (length(did_buy) * length(did_not_buy))
#> [1] 0.8117416
Let's compare that result to what we would get with a formal calculation:
[1] 0.8117416
Let's try a sampling approach
[1] 0.816
[1] 0.812
Let's do repeated simulation
# Monte-Carlo check of the AUC: in each of 100 repetitions, draw 2,000
# buyers and 2,000 non-buyers and record the share of pairs where the
# buyer's predicted probability is higher
# NOTE(review): no set.seed() here, so results vary across runs
s <- replicate(100, {
  mean(
    pred[sample(did_buy, 2000)] >
      pred[sample(did_not_buy, 2000)]
  )
})
mean(s)
#> [1] 0.81164